This document contains all the analyses done on ONT DNA data that was generated for the tailfindr paper. Knit this R markdown file after you have successfully run drake::r_make().
Load the required libraries first:
pacman::p_load(dplyr, magrittr, ggplot2, drake,
knitr, ggpubr, here, tidyverse, ggExtra)
Now load the data:
loadd(dna_kr_data)
# make a version of data in which tail lengths are capped to 300
dna_kr_data_capped <- dna_kr_data %>%
mutate(tail_length_ff = ifelse(tail_length_ff > 300, 300, tail_length_ff)) %>%
mutate(tail_length_st = ifelse(tail_length_st > 300, 300, tail_length_st))
Here is a description of columns of dna_kr_data:
knitr::kable(col_names_df)
| Columns | Description |
|---|---|
| read_id | Read ID |
| tail_start_ff | tailfindr estimate of poly(A) start site based on flipflop basecalling |
| tail_end_ff | tailfindr estimate of poly(A) end site based on flipflop basecalling |
| samples_per_nt_ff | tailfindr estimate of read-specific translocation rate in units of samples per nucleotide based on flipflop basecalling |
| tail_length_ff | tailfindr estimate of poly(A) tail length based on flipflop basecalling |
| tail_start_st | tailfindr estimate of poly(A) start site from standard model basecalling |
| tail_end_st | tailfindr estimate of poly(A) end site from standard model basecalling |
| samples_per_nt_st | tailfindr estimate of read-specific translocation rate in units of samples per nucleotide from dstandard model basecalling |
| tail_length_st | tailfindr estimate of poly(A) tail length from standard model basecalling |
| read_type | Whether the read is poly(A) or poly(T) read |
| barcode | Expected poly(A)/(T) tail length from spikeins |
| replicate | Replicate No |
| file_path | Full file path (relevant only for use within Valen lab) |
| transcript_alignment_start_st | Location of tail end by eGFP sequence alignment (standard model basecalling) |
| transcript_alignment_start_ff | Location of tail end by eGFP sequence alignment (flipflop model basecalling) |
| moves_in_tail_st | Moves between the tail boundaries in data basecalled using standard model |
| moves_in_tail_ff | Moves between the tail boundaries in data basecalled using flipflop model |
# define the function for computing standard error
std_err <- function(x) sd(x, na.rm = TRUE)/sqrt(length(x))
# summarize the data and display a table
summary_data <- dna_kr_data %>% group_by(barcode, read_type) %>%
summarise(read_count = n(),
mean = mean(tail_length_st, na.rm = TRUE),
median = median(tail_length_st, na.rm = TRUE),
std_dev = sd(tail_length_st, na.rm = TRUE),
std_err = std_err(tail_length_st))
summary_data %<>% mutate(cof_var = std_dev/mean)
kable(summary_data)
| barcode | read_type | read_count | mean | median | std_dev | std_err | cof_var |
|---|---|---|---|---|---|---|---|
| 10 | polyA | 5462 | 21.26826 | 13.060 | 25.56187 | 0.3458730 | 1.2018789 |
| 10 | polyT | 11072 | 16.22762 | 12.120 | 19.80606 | 0.1882284 | 1.2205155 |
| 30 | polyA | 13063 | 34.44375 | 31.210 | 16.90389 | 0.1478990 | 0.4907680 |
| 30 | polyT | 17087 | 31.65191 | 29.980 | 15.14715 | 0.1158772 | 0.4785540 |
| 40 | polyA | 6946 | 42.09980 | 40.290 | 17.44976 | 0.2093737 | 0.4144857 |
| 40 | polyT | 13811 | 39.02614 | 39.480 | 16.64152 | 0.1416056 | 0.4264200 |
| 60 | polyA | 8261 | 57.68607 | 59.140 | 18.89761 | 0.2079173 | 0.3275940 |
| 60 | polyT | 10072 | 53.26727 | 59.110 | 24.55896 | 0.2447103 | 0.4610517 |
| 100 | polyA | 3015 | 93.58427 | 96.820 | 23.11061 | 0.4208891 | 0.2469497 |
| 100 | polyT | 3166 | 91.69956 | 101.175 | 34.41122 | 0.6115678 | 0.3752604 |
| 150 | polyA | 1767 | 126.09282 | 138.460 | 41.75618 | 0.9933504 | 0.3311543 |
| 150 | polyT | 2535 | 119.84817 | 130.150 | 50.31467 | 0.9993224 | 0.4198201 |
To find out how robust the tail length estimated by tailfindr is across technical replicates:
p <- ggplot(dna_kr_data_capped, aes(x = barcode, y = tail_length_st,
color = replicate, fill = replicate)) +
geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
facet_grid(~barcode, scales = 'free') +
geom_hline(aes(yintercept = as.numeric(as.character(barcode))))
To address whether estimated tail lengths of poly(A) and poly(T) reads are comparable:
p <- ggplot(dna_kr_data_capped, aes(x = barcode, y = tail_length_st,
color = read_type, fill = read_type)) +
geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
facet_grid(~barcode, scales = 'free') +
geom_hline(aes(yintercept = as.numeric(as.character(barcode))))
## quartz_off_screen
## 2
To test whether basecalling strategy (standard model vs flip-flop model basecalling) has an influence on poly(A) length estimation:
# make data first
long_dna_data <- dna_kr_data %>% select(barcode, tail_length_ff, tail_length_st) %>%
gather(key = 'basecaller', value = 'tail_length', tail_length_ff, tail_length_st)
p <- ggplot(long_dna_data, aes(x = barcode, y = tail_length,
color = basecaller, fill = basecaller)) +
geom_two_sided_flat_violin(position = position_nudge(x = 0, y = 0), alpha = .5) +
facet_grid(~barcode, scales = 'free') +
geom_hline(aes(yintercept = as.numeric(as.character(barcode))))
First a new column transcript_end_tfis produced in our dataset. This column holds the tailfindr boundary location which is adjacent to transcript:
dna_kr_data %<>%
mutate(transcript_end_tf = ifelse(read_type == 'polyA',
tail_start_ff, tail_end_ff))
To visualize whether the coordinates of the tail end estimated by tailfindr match up with those obtained from the alignment with eGFP sequence:
p <- ggplot(dna_kr_data, aes(x = transcript_end_tf, y = transcript_alignment_start_ff)) +
geom_point(shape = 21, colour = 'black', fill = 'black',
size = 2, stroke=0, alpha = 0.1) +
geom_abline(intercept = 0, slope = 1, color="red",
linetype = 'dotted', size = 0.7) +
geom_smooth(method = 'lm',formula = y~x, color="#797979",
fullrange = TRUE, se = FALSE, size = 0.5) +
stat_cor(method = "pearson", label.x = 1000, label.y = 25000) +
coord_cartesian(xlim = c(0, 30000), ylim = c(0, 30000)) +
facet_grid(read_type~.)
To better visulize the difference, the same information is plotted as histogram:
hist_data <- mutate(dna_kr_data, diff = transcript_end_tf - transcript_alignment_start_ff)
p <- ggplot(hist_data, aes(x = diff)) +
geom_histogram(binwidth = 1) +
facet_grid(read_type~.)
First, classify the reads into erroneous and non-erroneous read
# Erroneous reads have tail length between 9 and 15
spurious_peak_data <- dna_kr_data %>%
filter(barcode == 60 | barcode == 100 | barcode == 150) %>%
mutate(read_classification =
if_else(tail_length_st >= 9 & tail_length_st <= 15,
'erroneous',
'non-erroneous'))
p <- ggplot(spurious_peak_data, aes(x = samples_per_nt_st,
color = read_classification)) +
geom_line(stat = 'density')
## quartz_off_screen
## 2
spurious_peak_data %<>%
mutate(tail_length_in_samples_st = tail_end_st - tail_start_st) %>%
mutate(barcode = as.character(barcode)) %>%
mutate(barcode = ifelse(read_classification == 'erroneous', 'erroneous', barcode)) %>%
mutate(barcode = fct_relevel(barcode, "erroneous", "60", "100", "150"))
p <- ggplot(spurious_peak_data, aes(x = tail_length_in_samples_st,
y = samples_per_nt_st,
color = barcode)) +
geom_point(alpha = 0.5, stroke = 0, size = 2)
spurious_peak_data %<>%
mutate(transcript_end_st = ifelse(read_type == 'polyA',
tail_start_st,
tail_end_st)) %>%
mutate(diff = transcript_end_st - transcript_alignment_start_st) %>%
mutate(
read_classification =
case_when(
read_type == 'polyT' &
read_classification == 'erroneous' ~ "erroneous_polyt",
read_type == 'polyT' &
read_classification == 'non-erroneous' ~ "non-erroneous_polyt",
read_type == 'polyA' &
read_classification == 'erroneous' ~ "erroneous_polya",
read_type == 'polyA' &
read_classification == 'non-erroneous' ~ "non-erroneous_polya"
)
) %>%
mutate(read_classification = fct_relevel(read_classification,
"erroneous_polyt",
"non-erroneous_polyt",
"erroneous_polya",
"non-erroneous_polya"))
p <- ggplot(spurious_peak_data, aes(x = diff,
color = read_classification)) +
geom_line(stat = 'density', size = 0.9, position = 'identity')
## quartz_off_screen
## 2
model_180mv <- here('extdata', 'r9.4_180mv_450bps_6mer_template_median68pA.model')
df_180mv <- read.csv(model_180mv, header = TRUE,
stringsAsFactors = FALSE, sep = '\t')
df_180mv <- as.tibble(df_180mv)
# sample 100 numbers from a Gaussian distribution for each kmer mu and sd
set.seed(5)
df_180mv %<>% select(kmer, level_mean, level_stdv) %>%
rowwise() %>%
mutate(sampled_numbers = list(rnorm(n=1000, level_mean, level_stdv)))
sampled_numbers <- purrr::flatten(df_180mv$sampled_numbers)
sampled_numbers <- unlist(sampled_numbers, recursive = FALSE)
# Calculate mean and sd of AAAAA state, and all the kmers combined
mean_AAAAAA <- mean(unlist(df_180mv$sampled_numbers[1]))
sd_AAAAAA <- sd(unlist(df_180mv$sampled_numbers[1]))
mean_TTTTTT <- mean(unlist(df_180mv$sampled_numbers[4096]))
sd_TTTTTT <- sd(unlist(df_180mv$sampled_numbers[4096]))
mean_all_kmers <- mean(sampled_numbers)
sd_all_kmers <- sd(sampled_numbers)
minimum_expected_nomrmalized_AAAAAA_level <-
((mean_AAAAAA - 2 * sd_AAAAAA) - mean_all_kmers) / sd_all_kmers
minimum_expected_nomrmalized_TTTTTT_level <-
((mean_TTTTTT - 2 * sd_TTTTTT) - mean_all_kmers) / sd_all_kmers
cat(paste0("Minimum expected nomrmalized AAAAAA level: ",
abs(minimum_expected_nomrmalized_AAAAAA_level), '\n'))
## Minimum expected nomrmalized AAAAAA level: 0.518144121551562
cat(paste0("Minimum expected nomrmalized TTTTTT level: ",
abs(minimum_expected_nomrmalized_TTTTTT_level), '\n'))
## Minimum expected nomrmalized TTTTTT level: 0.20178577950747
We first load RNA data
loadd(rna_kr_data)
Because RNA and DNA dataset have different number of reads in each barcode, therefore, before comparing these two datasets, we make a new dataset in which there are equal number of reads in all barcode in both RNA and DNA conditions.
polyat_dna <- select(dna_kr_data, tail_length_st, barcode) %>%
mutate(barcode = as.numeric(as.character(barcode)))
polya_rna <-select(rna_kr_data, tail_length_tf, barcode) %>%
mutate(barcode = as.numeric(as.character(barcode)))
# tabulate reads in each DNA barcode
dna_barcode_nums <- as.data.frame(table(polyat_dna$barcode))
dna_barcode_nums <-
rename(dna_barcode_nums, barcode = Var1, n_desired = Freq)
dna_barcode_nums %<>% mutate(barcode = as.numeric(as.character(barcode)))
# randomize rna data
set.seed(5)
polya_rna <- polya_rna[sample(nrow(polya_rna)),]
# inlclude only complete cases
polya_rna <- polya_rna[complete.cases(polya_rna), ]
# make a subset of RNA reads with same of reads in each barcode as that in DNA
polya_rna_subset <- left_join(polya_rna, dna_barcode_nums, by = "barcode") %>%
group_by(barcode) %>%
slice(seq(first(n_desired))) %>%
select(-n_desired)
# combine rna and dna datasets
polya_rna_subset %<>% rename(tail_length = tail_length_tf) %>%
mutate(data_type = 'RNA')
polyat_dna %<>% mutate(data_type = 'DNA') %>%
rename(tail_length = tail_length_st)
df <- bind_rows(polyat_dna, polya_rna_subset)
df %<>% dplyr::mutate(
data_type = as.factor(data_type),
barcode = as.factor(barcode),
tail_length = ifelse(tail_length > 300, 300, tail_length)
)
Now plotting the data:
p <- ggplot(data = df, aes(x = barcode, y = tail_length,
color = data_type, fill = data_type)) +
geom_two_sided_flat_violin(position = position_nudge(x = .2, y = 0), alpha = .5) +
facet_grid(~barcode, scales = 'free') +
geom_hline(aes(yintercept = as.numeric(as.character(barcode))))
## quartz_off_screen
## 2
# get a subset of data (30 and 100 nt tails)
dna_kr_data_bc_30_100 <- dna_kr_data %>%
filter(barcode == 30 | barcode == 100)
cols <- c('#8c8c8c', '#e8b00b')
p <- ggplot(dna_kr_data_bc_30_100,
aes(x = tail_length_st, y = moves_in_tail_st, colour = barcode)) +
geom_hline(yintercept = c(30, 100), color = cols,
linetype = 'dashed', size = 0.4) +
geom_vline(xintercept = c(30, 100), color = cols,
linetype = 'dashed', size = 0.4) +
geom_point(alpha = 0.09) +
theme(panel.background = element_blank(),
axis.line = element_line(color = "black", size = 0.4),
legend.direction = "horizontal",
legend.position = "bottom",
rect = element_rect(fill = "transparent")) +
xlim(0, 160) + ylim(0, 110) +
scale_x_continuous(breaks = c(0, 30, 50, 100, 150),
labels = c('0', '30', '50', '100', '150'),
limits = c(0, 160)) +
scale_y_continuous(breaks = c(0, 30, 50, 100),
labels = c('0', '30', '50', '100'),
limits = c(0, 110)) +
scale_color_manual(values = cols) +
xlab('tailfindr tail-length prediction (nt)') +
ylab('Total number of moves in the tail')
p <- ggMarginal(p, groupColour = TRUE, groupFill = TRUE)
cols <- c('#8c8c8c', '#e8b00b')
p <- ggplot(dna_kr_data_bc_30_100,
aes(x = tail_length_ff, y = moves_in_tail_ff, colour = barcode)) +
geom_hline(yintercept = c(30, 100), color = cols,
linetype = 'dashed', size = 0.4) +
geom_vline(xintercept = c(30, 100), color = cols,
linetype = 'dashed', size = 0.4) +
geom_point(alpha = 0.09) +
theme(panel.background = element_blank(),
axis.line = element_line(color = "black", size = 0.4),
legend.direction = "horizontal",
legend.position = "bottom",
rect = element_rect(fill = "transparent")) +
xlim(0, 160) + ylim(0, 110) +
scale_x_continuous(breaks = c(0, 30, 50, 100, 150),
labels = c('0', '30', '50', '100', '150'),
limits = c(0, 160)) +
scale_y_continuous(breaks = c(0, 30, 50, 100),
labels = c('0', '30', '50', '100'),
limits = c(0, 110)) +
scale_color_manual(values = cols) +
xlab('tailfindr tail-length prediction (nt)') +
ylab('Total number of moves in the tail')
p <- ggMarginal(p, groupColour = TRUE, groupFill = TRUE)